## Rows: 104
## Columns: 5
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ dept_id <dbl> 6, 4, 3, 4, 2, 6, 3, NA, 3, 6, 6, 6, 4, 4, 6, 3, 5, 4, 4,...
## $ dept_name <chr> "Production", "Business Development", "Design", "Business...
## $ name <chr> "B****, Li*** N.", "C******, Jane G.", "E******, Ma* B.",...
## $ salary <chr> "91100", "173900", "163100", "71000", "111100", "80600", ...
#simple cleaning
#disabled step that would drop the id and name columns:
#a = a %>% select(-id, -name)
#salary is read in as character, so parse it to numeric
#dept_id should be a factor/category variable
#especially important to convert if imputation is done
a = mutate(
  a,
  salary = as.numeric(salary),
  dept_id = as.factor(dept_id)
)
# Preview the first rows of the data.
a %>% head
# Count rows for every dept_name / dept_id pairing, add each pairing's share
# of all rows, and sort by dept_name, then by descending share.
a %>%
  count(dept_name, dept_id, name = 'count') %>%
  mutate(percent = count/sum(count)) %>%
  arrange(dept_name, -percent)
#correct dept misspellings
# Map the two misspelled department names onto their correct spelling and
# leave every other value (including NA) untouched, then store dept_name
# as a factor.
a = a %>%
  mutate(
    dept_name = case_when(
      dept_name == 'Business Developement' ~ 'Business Development',
      dept_name == 'Producdion' ~ 'Production',
      TRUE ~ dept_name
    ),
    dept_name = factor(dept_name)
  )
#check
a %>% select(dept_name) %>% freq
# Lookup table from dept_id to dept_name.
# sort() drops NA, so only the non-missing ids appear, in ascending order.
# dept_name is matched to dept_id by position, so the names below must be
# listed in that same ascending-id order.
# The outer parentheses print the tibble as it is assigned.
(dept.mapping = tibble(
  dept_id = a %>% pull(dept_id) %>% unique() %>% sort,
  dept_name = c('Human Resources', 'Design', 'Business Development', 'Accounting', 'Production'),
))
But there’s a faster and more automated way: imputation via machine learning, specifically the Random Forest algorithm (used for both classification and regression).
# ids of the rows whose dept_name is missing
missing.dept_name.ids = a %>%
  filter(is.na(dept_name)) %>%
  pull(id)
# these rows are missing for dept_id
a %>% filter(is.na(dept_id))
# store the ids of those rows
missing.dept_id.ids = a %>% filter(is.na(dept_id)) %>% pull(id)
# these rows are missing for salary
a %>% filter(is.na(salary))
# Mean and median salary per department, computed on a.imputed.
# NOTE(review): a.imputed is not defined in this excerpt -- presumably the
# imputed copy of `a`; confirm it exists before this point.
a.imputed %>% group_by(dept_name) %>% summarise(
  mean.salary = mean(salary),
  median.salary = median(salary)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Full Tukey HSD output for anova.salary.dept
# (presumably the salary-by-department ANOVA fit; defined outside this excerpt).
TukeyHSD(anova.salary.dept)

# Same test as a tidy table, keeping only the contrasts whose adjusted
# p-value is below 0.05.
anova.salary.dept %>%
  TukeyHSD() %>%
  tidy() %>%
  filter(adj.p.value < 0.05) %>%
  select(contrast, adj.p.value)
Looking at the visualization of salary distribution by department above, these results make sense.
The ‘Design’ dept clearly has a median pay above that of the other departments. Even more telling, its first quartile, at $103,250, nearly exceeds the median of three other departments.